In [1]:
#import the libraries: data handling, sklearn models/metrics/preprocessing, plotting
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
import requests
import joblib

import warnings
#NOTE(review): silencing ALL warnings also hides MLP convergence and seaborn deprecation notices — consider narrowing the filter
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from collections import Counter
In [2]:
#dataset URL (UCI wine quality, red wines)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

#download the file; fail loudly on an HTTP error instead of silently saving an error page
#(the parse that used to happen here was redundant: the next cell rebuilds `dataframe`
#from the saved local copy anyway)
response = requests.get(url)
response.raise_for_status()

#save the raw CSV locally so later cells read it without hitting the network again
with open('winequality-red.csv', 'wb') as file:
    file.write(response.content)
In [3]:
#load the locally saved dataset (semicolon-separated)
dataframe = pd.read_csv('winequality-red.csv', sep=';')
#binarize the target: scores >= 6 become 'Bom' (good), everything else 'Ruim' (bad)
dataframe['quality'] = dataframe['quality'].apply(lambda score: 'Bom' if score >= 6 else 'Ruim')
In [4]:
#count and drop duplicated rows: 1599 - 1359 = 240 rows removed
duplicados = dataframe.duplicated().sum()
print(duplicados, "linhas excluídas")

#drop rows with nulls and reindex so the remaining rows get contiguous indices from zero
df = dataframe.drop_duplicates().dropna().reset_index(drop=True)
df
240 linhas excluídas
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 Ruim
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 Ruim
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 Ruim
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 Bom
4 7.4 0.660 0.00 1.8 0.075 13.0 40.0 0.99780 3.51 0.56 9.4 Ruim
... ... ... ... ... ... ... ... ... ... ... ... ...
1354 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 Bom
1355 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 Ruim
1356 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 Bom
1357 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 Ruim
1358 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 Bom

1359 rows × 12 columns

In [5]:
#encode the categorical target as integers; LabelEncoder assigns labels in
#alphabetical order, so 'Bom' -> 0 and 'Ruim' -> 1, returned as a NumPy array
le = LabelEncoder()
encoded = le.fit_transform(df['quality'])

#wrap the array back into a one-column DataFrame so it can be joined by index
quality = pd.DataFrame(encoded, columns=['quality'])

#swap the string column for its numeric encoding
df = df.drop('quality', axis=1).join(quality)
df
Out[5]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 1
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 1
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 1
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 0
4 7.4 0.660 0.00 1.8 0.075 13.0 40.0 0.99780 3.51 0.56 9.4 1
... ... ... ... ... ... ... ... ... ... ... ... ...
1354 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 0
1355 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 1
1356 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 0
1357 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 1
1358 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 0

1359 rows × 12 columns

In [6]:
#count missing values per column (all zero after the dropna above)
nulos = df.isna().sum()
print(nulos)
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
In [7]:
#IQR multiplier that defines the outlier cut-offs
limite = 1.5

#report, per column, how many observations fall outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
for coluna in df.columns:
    #interquartile range
    q1, q3 = df[coluna].quantile([0.25, 0.75])
    iqr = q3 - q1

    #lower / upper cut-offs
    lim_inf = q1 - limite * iqr
    lim_sup = q3 + limite * iqr

    #count outliers on each side
    n_outliers_sup = (df[coluna] > lim_sup).sum()
    n_outliers_inf = (df[coluna] < lim_inf).sum()

    #show how many outliers each variable has
    print(f"Variável '{coluna}' possui {n_outliers_sup} outliers acima do limite superior e {n_outliers_inf} outliers abaixo do limite inferior.")
Variável 'fixed acidity' possui 41 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'volatile acidity' possui 19 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'citric acid' possui 1 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'residual sugar' possui 126 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'chlorides' possui 83 outliers acima do limite superior e 4 outliers abaixo do limite inferior.
Variável 'free sulfur dioxide' possui 26 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'total sulfur dioxide' possui 45 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'density' possui 17 outliers acima do limite superior e 18 outliers abaixo do limite inferior.
Variável 'pH' possui 17 outliers acima do limite superior e 11 outliers abaixo do limite inferior.
Variável 'sulphates' possui 55 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'alcohol' possui 12 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
Variável 'quality' possui 0 outliers acima do limite superior e 0 outliers abaixo do limite inferior.
In [8]:
#columns whose outliers will actually be removed
colunas = ["chlorides", "residual sugar", "sulphates"]

#IQR multiplier that defines the outlier cut-offs
limite = 1.5

#snapshot the row count BEFORE filtering; the previous version reconstructed it by
#re-reading the CSV and subtracting the duplicate count, which is redundant I/O and
#breaks if the file is missing or the earlier cleaning changes
n_antes = len(df)

#iterate over the selected columns
for coluna in colunas:
    #compute the IQR
    Q1 = df[coluna].quantile(0.25)
    Q3 = df[coluna].quantile(0.75)
    IQR = Q3 - Q1

    #cut-off limits
    lim_sup = Q3 + limite*IQR
    lim_inf = Q1 - limite*IQR

    #keep only the rows inside the limits for this column
    df = df[(df[coluna] >= lim_inf) & (df[coluna] <= lim_sup)]

#show the number of observations before and after outlier removal
print("Número de observações antes da remoção de outliers:", n_antes)
print("Número de observações após a remoção de outliers:", len(df))

#reindex the remaining rows so indices are contiguous from zero
df = df.dropna().reset_index(drop=True)
df
Número de observações antes da remoção de outliers: 1359
Número de observações após a remoção de outliers: 1122
Out[8]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 1
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 1
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 1
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 0
4 7.4 0.660 0.00 1.8 0.075 13.0 40.0 0.99780 3.51 0.56 9.4 1
... ... ... ... ... ... ... ... ... ... ... ... ...
1117 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 0
1118 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 1
1119 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 0
1120 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 1
1121 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 0

1122 rows × 12 columns

In [9]:
#descrição estatística resumida dos dados contidos
'''
count: o número de valores não nulos na coluna.
mean: a média dos valores na coluna.
std: o desvio padrão dos valores na coluna.
min: o valor mínimo na coluna.
25%: o valor do primeiro quartil (25%).
50%: o valor do segundo quartil (50% ou a mediana).
75%: o valor do terceiro quartil (75%).
max: o valor máximo na coluna.
'''
df.drop(['quality'], axis=1).describe()
Out[9]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
count 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000 1122.000000
mean 8.229768 0.527589 0.256658 2.197193 0.078112 15.697415 45.245989 0.996533 3.322442 0.629831 10.430273
std 1.666382 0.179943 0.187681 0.454798 0.015130 9.766202 30.680247 0.001779 0.150086 0.113503 1.057920
min 4.600000 0.120000 0.000000 0.900000 0.039000 1.000000 6.000000 0.990070 2.860000 0.330000 8.400000
25% 7.100000 0.390000 0.090000 1.900000 0.068000 8.000000 22.250000 0.995475 3.220000 0.550000 9.500000
50% 7.900000 0.520000 0.240000 2.100000 0.078000 14.000000 37.000000 0.996560 3.320000 0.610000 10.200000
75% 9.100000 0.640000 0.400000 2.500000 0.087000 21.000000 60.000000 0.997600 3.410000 0.700000 11.100000
max 15.000000 1.330000 0.750000 3.650000 0.122000 57.000000 165.000000 1.001400 4.010000 0.940000 14.000000
In [10]:
#split the dataset into features (X) and target (y)
X = df.drop(columns=['quality'])
y = df['quality']
In [11]:
#scale the data with Normalizer, which rescales each ROW to unit norm
#NOTE(review): for per-feature scaling StandardScaler/MinMaxScaler is more usual — confirm this is intentional
scaler = Normalizer()
Xz = pd.DataFrame(scaler.fit_transform(X))
In [25]:
#trackers for the best run (per-model accuracy, predictions, fitted models) and
#per-run accuracy histories
best_dt_accuracy = None
best_nb_accuracy = None
best_mlp_accuracy = None
best_ensemble_accuracy = 0
best_dt_predictions = None
best_nb_predictions = None
best_mlp_predictions = None
best_ensemble_predictions = None
best_y_test = None
best_dt_model = None
best_nb_model = None
best_mlp_model = None
dt_accuracies = []
nb_accuracies = []
mlp_accuracies = []
ensemble_accuracies = []

def _majority_vote(preds_a, preds_b, preds_c):
    """Element-wise majority vote across three prediction sequences."""
    return [Counter(trio).most_common(1)[0][0]
            for trio in zip(preds_a, preds_b, preds_c)]

#train 2000 times and keep the run with the highest ensemble accuracy
#NOTE(review): selecting the best of many random splits overestimates real performance — confirm intent
for _ in range(2000):

    #random 70/30 train/test split (no fixed random_state, so every run differs)
    X_train, X_test, y_train, y_test = train_test_split(Xz, y, test_size=0.3)

    #fresh individual models
    dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=6)
    nb_model = GaussianNB()
    mlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3))

    #fit each model on the training fold
    dt_model.fit(X_train, y_train)
    nb_model.fit(X_train, y_train)
    mlp_model.fit(X_train, y_train)

    #held-out accuracy of each model
    dt_accuracy = dt_model.score(X_test, y_test)
    nb_accuracy = nb_model.score(X_test, y_test)
    mlp_accuracy = mlp_model.score(X_test, y_test)

    #test-fold predictions
    dt_predictions = dt_model.predict(X_test)
    nb_predictions = nb_model.predict(X_test)
    mlp_predictions = mlp_model.predict(X_test)

    #record the individual accuracies
    dt_accuracies.append(dt_accuracy)
    nb_accuracies.append(nb_accuracy)
    mlp_accuracies.append(mlp_accuracy)

    #combine the three models by majority vote
    ensemble_predictions = _majority_vote(dt_predictions, nb_predictions, mlp_predictions)

    #ensemble accuracy for this run
    ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
    ensemble_accuracies.append(ensemble_accuracy)

    #keep everything from the best run so far
    if ensemble_accuracy > best_ensemble_accuracy:
        best_dt_accuracy, best_nb_accuracy, best_mlp_accuracy = dt_accuracy, nb_accuracy, mlp_accuracy
        best_ensemble_accuracy = ensemble_accuracy
        best_dt_predictions, best_nb_predictions, best_mlp_predictions = dt_predictions, nb_predictions, mlp_predictions
        best_ensemble_predictions = ensemble_predictions
        best_y_test = y_test
        best_dt_model, best_nb_model, best_mlp_model = dt_model, nb_model, mlp_model

#persist the best models
joblib.dump(best_dt_model, 'dt.joblib')
joblib.dump(best_nb_model, 'nb.joblib')
joblib.dump(best_mlp_model, 'mlp.joblib')

print("Sucesso!")
Sucesso!
In [26]:
#best-run accuracy of each model and of the ensemble
print(f"Acurácia da Decision Tree: {best_dt_accuracy * 100:.2f}%")
print(f"Acurácia do Naive Bayes: {best_nb_accuracy * 100:.2f}%")
print(f"Acurácia do MLP: {best_mlp_accuracy * 100:.2f}%")
print(f"Acurácia do Ensemble: {best_ensemble_accuracy * 100:.2f}%")
Acurácia da Decision Tree: 67.06%
Acurácia do Naive Bayes: 70.92%
Acurácia do MLP: 79.53%
Acurácia do Ensemble: 79.23%
In [27]:
#confusion matrix and classification report for the best ensemble run
matriz = pd.crosstab(best_y_test, best_ensemble_predictions, rownames=['Real'], colnames=['Predito'], margins=True, margins_name='Todos')
print("\nMatriz de confusão detalhada do Ensemble:\n", matriz, "\n")
#LabelEncoder order is alphabetical, so 0 -> 'Bom' and 1 -> 'Ruim'
relatorio = metrics.classification_report(best_y_test, best_ensemble_predictions, target_names=['Bom', 'Ruim'])
print("Relatório sobre a qualidade do Ensemble:\n", relatorio)
Matriz de confusão detalhada do Ensemble:
 Predito    0    1  Todos
Real                    
0        138   30    168
1         40  129    169
Todos    178  159    337 

Relatório sobre a qualidade do Ensemble:
               precision    recall  f1-score   support

         Bom       0.78      0.82      0.80       168
        Ruim       0.81      0.76      0.79       169

    accuracy                           0.79       337
   macro avg       0.79      0.79      0.79       337
weighted avg       0.79      0.79      0.79       337

In [28]:
#dataFrame com as respostas das três IAs mais o target
df_predict = pd.DataFrame({
    'Preditor Decision Tree': best_dt_predictions,
    'Preditor Naive Bayes': best_nb_predictions,
    'Preditor MLPClassifier': best_mlp_predictions,
    'Preditor Ensemble': best_ensemble_predictions,
    'Real': best_y_test
})

#pd.set_option('display.max_rows', None)
#pd.reset_option('display.max_rows')
df_predict
Out[28]:
Preditor Decision Tree Preditor Naive Bayes Preditor MLPClassifier Preditor Ensemble Real
757 0 1 0 0 0
681 0 1 0 0 0
508 1 1 1 1 1
542 1 1 1 1 1
196 1 1 1 1 1
... ... ... ... ... ...
264 0 1 0 0 1
99 1 1 1 1 1
1088 1 0 1 1 1
628 1 1 1 1 0
572 1 0 0 0 1

337 rows × 5 columns

In [29]:
#mean and standard deviation of the per-run ensemble accuracies
print("Média: {:.2f}%".format(np.mean(ensemble_accuracies) * 100))
print("Desvio padrão: {:.2f}%".format(np.std(ensemble_accuracies) * 100))

#histogram of the accuracy distribution (explicit figure/axes API)
fig, ax = plt.subplots()
ax.hist(ensemble_accuracies, bins=10)
ax.set(xlabel='Acurácia', ylabel='Frequência', title='Distribuição de Acurácia do Ensemble')
plt.show()
Média: 69.16%
Desvio padrão: 3.56%
No description has been provided for this image
In [30]:
#mean and standard deviation of the per-run ensemble accuracies
print("Média: {:.2f}%". format(np.mean(ensemble_accuracies)*100))
print("Desvio padrão: {:.2f}%".format(np.std(ensemble_accuracies)*100))

#density plot: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
#histplot(..., kde=True, stat='density') is the supported equivalent
sns.histplot(ensemble_accuracies, kde=True, stat='density')
plt.xlabel('Acurácia')
plt.ylabel('Frequência')
plt.yticks([])
plt.title("Distribuição de Acurácia do Ensemble")
plt.show()
Média: 69.16%
Desvio padrão: 3.56%
No description has been provided for this image
In [31]:
#the two classes are not separable by eye in this alcohol x sulphates projection
sns.scatterplot(data=df, x='alcohol', y='sulphates', hue='quality')
Out[31]:
<Axes: xlabel='alcohol', ylabel='sulphates'>
No description has been provided for this image
In [32]:
#the two classes are not separable by eye in this projection
#color encodes the already-encoded target: 'Bom' -> 0, 'Ruim' -> 1 (LabelEncoder's alphabetical order)
plt.scatter(df['alcohol'], df['sulphates'], c=df['quality'], cmap='coolwarm', alpha=0.7)
plt.xlabel('Álcool')
plt.ylabel('Sulfatos')
plt.title('Gráfico de Dispersão')
plt.show()
No description has been provided for this image
In [33]:
#pairwise scatter matrix — no single pair of variables separates the classes visually
grade = sns.pairplot(df, hue='quality')
plt.show()
No description has been provided for this image
In [34]:
#re-split features and target (df was filtered above)
X = df.drop(columns=['quality'])
y = df['quality']

#column names: the 11 features first, the encoded target last
columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
           'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
           'pH', 'sulphates', 'alcohol', 'quality']

#hand-crafted single examples: one expected-good wine (label 0 = 'Bom')
#and one expected-bad wine (label 1 = 'Ruim')
good_values = [14.5, 0.15, 0.7, 3.2, 0.04, 50, 23, 0.990, 4, 0.9, 13.9, 0]
bad_values = [8, 1.2, 0.25, 2.0, 0.09, 8, 127, 1, 3.1, 0.35, 9, 1]

#one-row DataFrames for each example
good = pd.DataFrame([good_values], columns=columns)
bad = pd.DataFrame([bad_values], columns=columns)

#features = first 11 columns, target = the 'quality' column
good_data, good_target = good.iloc[:, :11], good['quality']
bad_data, bad_target = bad.iloc[:, :11], bad['quality']
In [35]:
#row-normalize the two hand-crafted examples with the same transform family used for Xz.
#Normalizer is stateless (fit learns nothing), so a single fit suffices; the original
#code fitted on good_data and then again on bad_data, where the second fit silently
#discarded the first — harmless here only because Normalizer ignores fit entirely
scaler = Normalizer()
good_z = pd.DataFrame(scaler.fit_transform(good_data))
bad_z = pd.DataFrame(scaler.transform(bad_data))
In [36]:
#best-run trackers for the single "good wine" sanity check
gbest_dt_accuracy = None
gbest_nb_accuracy = None
gbest_mlp_accuracy = None
gbest_ensemble_accuracy = 0
gbest_dt_predictions = None
gbest_nb_predictions = None
gbest_mlp_predictions = None
gbest_ensemble_predictions = None
gbest_y_test = None
gbest_dt_model = None
gbest_nb_model = None
gbest_mlp_model = None
gdt_accuracies = []
gnb_accuracies = []
gmlp_accuracies = []
gensemble_accuracies = []

#train 20 times on the FULL dataset and evaluate on the one hand-crafted good example
#NOTE(review): a one-row "test set" only checks agreement with the hand-written label
for _ in range(20):

    #fresh individual models
    gdt_model = DecisionTreeClassifier(criterion='entropy', max_depth=6)
    gnb_model = GaussianNB()
    gmlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3))

    #fit on all normalized data (no hold-out here)
    gdt_model.fit(Xz, y)
    gnb_model.fit(Xz, y)
    gmlp_model.fit(Xz, y)

    #accuracy on the single good example
    gdt_accuracy = gdt_model.score(good_z, good_target)
    gnb_accuracy = gnb_model.score(good_z, good_target)
    gmlp_accuracy = gmlp_model.score(good_z, good_target)

    #predictions for the good example
    gdt_predictions = gdt_model.predict(good_z)
    gnb_predictions = gnb_model.predict(good_z)
    gmlp_predictions = gmlp_model.predict(good_z)

    #record the individual accuracies
    gdt_accuracies.append(gdt_accuracy)
    gnb_accuracies.append(gnb_accuracy)
    gmlp_accuracies.append(gmlp_accuracy)

    #majority vote of the three models
    gensemble_predictions = [Counter(trio).most_common(1)[0][0]
                             for trio in zip(gdt_predictions, gnb_predictions, gmlp_predictions)]

    #ensemble accuracy for this run
    gensemble_accuracy = accuracy_score(good_target, gensemble_predictions)
    gensemble_accuracies.append(gensemble_accuracy)

    #keep the best run so far
    if gensemble_accuracy > gbest_ensemble_accuracy:
        gbest_dt_accuracy, gbest_nb_accuracy, gbest_mlp_accuracy = gdt_accuracy, gnb_accuracy, gmlp_accuracy
        gbest_ensemble_accuracy = gensemble_accuracy
        gbest_dt_predictions, gbest_nb_predictions, gbest_mlp_predictions = gdt_predictions, gnb_predictions, gmlp_predictions
        gbest_ensemble_predictions = gensemble_predictions
        gbest_dt_model, gbest_nb_model, gbest_mlp_model = gdt_model, gnb_model, gmlp_model

print("Sucesso!")
Sucesso!
In [37]:
#best-run accuracy on the single good example (trivially 100% when all agree with the label)
print(f"Acurácia da Decision Tree: {gbest_dt_accuracy * 100:.2f}%")
print(f"Acurácia do Naive Bayes: {gbest_nb_accuracy * 100:.2f}%")
print(f"Acurácia do MLP: {gbest_mlp_accuracy * 100:.2f}%")
print(f"Acurácia do Ensemble: {gbest_ensemble_accuracy * 100:.2f}%")
Acurácia da Decision Tree: 100.00%
Acurácia do Naive Bayes: 100.00%
Acurácia do MLP: 100.00%
Acurácia do Ensemble: 100.00%
In [38]:
#one-row confusion matrix and report for the good example (labels fixed so both classes appear)
gmatriz = pd.crosstab(good_target, gbest_ensemble_predictions, rownames=['Real'], colnames=['Predito'], margins=True, margins_name='Todos')
print("\nMatriz de confusão detalhada do Ensemble:\n", gmatriz, "\n")
grelatorio = metrics.classification_report(good_target, gbest_ensemble_predictions, labels=[0, 1], target_names=['Bom', 'Ruim'])
print("Relatório sobre a qualidade do Ensemble:\n", grelatorio)
Matriz de confusão detalhada do Ensemble:
 Predito  0  Todos
Real             
0        1      1
Todos    1      1 

Relatório sobre a qualidade do Ensemble:
               precision    recall  f1-score   support

         Bom       1.00      1.00      1.00         1
        Ruim       0.00      0.00      0.00         0

    accuracy                           1.00         1
   macro avg       0.50      0.50      0.50         1
weighted avg       1.00      1.00      1.00         1

In [39]:
#answers of the three models, the ensemble, and the true label for the good example
gcolunas_pred = {
    'Preditor Decision Tree': gbest_dt_predictions,
    'Preditor Naive Bayes': gbest_nb_predictions,
    'Preditor MLPClassifier': gbest_mlp_predictions,
    'Preditor Ensemble': gbest_ensemble_predictions,
    'Real': good_target,
}
gdf_predict = pd.DataFrame(gcolunas_pred)
gdf_predict
Out[39]:
Preditor Decision Tree Preditor Naive Bayes Preditor MLPClassifier Preditor Ensemble Real
0 0 0 0 0 0
In [40]:
#mean and standard deviation of the good-example ensemble accuracies
print("Média: {:.2f}%".format(np.mean(gensemble_accuracies) * 100))
print("Desvio padrão: {:.2f}%".format(np.std(gensemble_accuracies) * 100))

#histogram of the accuracy distribution (explicit figure/axes API)
fig, ax = plt.subplots()
ax.hist(gensemble_accuracies, bins=10)
ax.set(xlabel='Acurácia', ylabel='Frequência', title='Distribuição de Acurácia do Ensemble')
plt.show()
Média: 100.00%
Desvio padrão: 0.00%
No description has been provided for this image
In [41]:
#mean and standard deviation of the good-example ensemble accuracies
print("Média: {:.2f}%". format(np.mean(gensemble_accuracies)*100))
print("Desvio padrão: {:.2f}%".format(np.std(gensemble_accuracies)*100))

#density plot: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
#histplot(..., kde=True, stat='density') is the supported equivalent
sns.histplot(gensemble_accuracies, kde=True, stat='density')
plt.xlabel('Acurácia')
plt.ylabel('Frequência')
plt.yticks([])
plt.title("Distribuição de Acurácia do Ensemble")
plt.show()
Média: 100.00%
Desvio padrão: 0.00%
No description has been provided for this image
In [50]:
#best-run trackers for the single "bad wine" sanity check
bbest_dt_accuracy = None
bbest_nb_accuracy = None
bbest_mlp_accuracy = None
bbest_ensemble_accuracy = 0
bbest_dt_predictions = None
bbest_nb_predictions = None
bbest_mlp_predictions = None
bbest_ensemble_predictions = None
bbest_y_test = None
bbest_dt_model = None
bbest_nb_model = None
bbest_mlp_model = None
bdt_accuracies = []
bnb_accuracies = []
bmlp_accuracies = []
bensemble_accuracies = []

#train 20 times on the FULL dataset and evaluate on the one hand-crafted bad example
#NOTE(review): a one-row "test set" only checks agreement with the hand-written label
for _ in range(20):

    #fresh individual models
    bdt_model = DecisionTreeClassifier(criterion='entropy', max_depth=6)
    bnb_model = GaussianNB()
    bmlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3))

    #fit on all normalized data (no hold-out here)
    bdt_model.fit(Xz, y)
    bnb_model.fit(Xz, y)
    bmlp_model.fit(Xz, y)

    #accuracy on the single bad example
    bdt_accuracy = bdt_model.score(bad_z, bad_target)
    bnb_accuracy = bnb_model.score(bad_z, bad_target)
    bmlp_accuracy = bmlp_model.score(bad_z, bad_target)

    #predictions for the bad example
    bdt_predictions = bdt_model.predict(bad_z)
    bnb_predictions = bnb_model.predict(bad_z)
    bmlp_predictions = bmlp_model.predict(bad_z)

    #record the individual accuracies
    bdt_accuracies.append(bdt_accuracy)
    bnb_accuracies.append(bnb_accuracy)
    bmlp_accuracies.append(bmlp_accuracy)

    #majority vote of the three models
    bensemble_predictions = [Counter(trio).most_common(1)[0][0]
                             for trio in zip(bdt_predictions, bnb_predictions, bmlp_predictions)]

    #ensemble accuracy for this run
    bensemble_accuracy = accuracy_score(bad_target, bensemble_predictions)
    bensemble_accuracies.append(bensemble_accuracy)

    #keep the best run so far
    if bensemble_accuracy > bbest_ensemble_accuracy:
        bbest_dt_accuracy, bbest_nb_accuracy, bbest_mlp_accuracy = bdt_accuracy, bnb_accuracy, bmlp_accuracy
        bbest_ensemble_accuracy = bensemble_accuracy
        bbest_dt_predictions, bbest_nb_predictions, bbest_mlp_predictions = bdt_predictions, bnb_predictions, bmlp_predictions
        bbest_ensemble_predictions = bensemble_predictions
        bbest_dt_model, bbest_nb_model, bbest_mlp_model = bdt_model, bnb_model, bmlp_model

print("Sucesso!")
Sucesso!
In [51]:
#best-run accuracy on the single bad example (trivially 100% when all agree with the label)
print(f"Acurácia da Decision Tree: {bbest_dt_accuracy * 100:.2f}%")
print(f"Acurácia do Naive Bayes: {bbest_nb_accuracy * 100:.2f}%")
print(f"Acurácia do MLP: {bbest_mlp_accuracy * 100:.2f}%")
print(f"Acurácia do Ensemble: {bbest_ensemble_accuracy * 100:.2f}%")
Acurácia da Decision Tree: 100.00%
Acurácia do Naive Bayes: 100.00%
Acurácia do MLP: 100.00%
Acurácia do Ensemble: 100.00%
In [52]:
#one-row confusion matrix and report for the bad example (labels fixed so both classes appear)
bmatriz = pd.crosstab(bad_target, bbest_ensemble_predictions, rownames=['Real'], colnames=['Predito'], margins=True, margins_name='Todos')
print("\nMatriz de confusão detalhada do Ensemble:\n", bmatriz, "\n")
brelatorio = metrics.classification_report(bad_target, bbest_ensemble_predictions, labels=[0, 1], target_names=['Bom', 'Ruim'])
print("Relatório sobre a qualidade do Ensemble:\n", brelatorio)
Matriz de confusão detalhada do Ensemble:
 Predito  1  Todos
Real             
1        1      1
Todos    1      1 

Relatório sobre a qualidade do Ensemble:
               precision    recall  f1-score   support

         Bom       0.00      0.00      0.00         0
        Ruim       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       0.50      0.50      0.50         1
weighted avg       1.00      1.00      1.00         1

In [53]:
#answers of the three models, the ensemble, and the true label for the bad example
bcolunas_pred = {
    'Preditor Decision Tree': bbest_dt_predictions,
    'Preditor Naive Bayes': bbest_nb_predictions,
    'Preditor MLPClassifier': bbest_mlp_predictions,
    'Preditor Ensemble': bbest_ensemble_predictions,
    'Real': bad_target,
}
bdf_predict = pd.DataFrame(bcolunas_pred)
bdf_predict
Out[53]:
Preditor Decision Tree Preditor Naive Bayes Preditor MLPClassifier Preditor Ensemble Real
0 1 1 1 1 1
In [54]:
#mean and standard deviation of the bad-example ensemble accuracies
print("Média: {:.2f}%".format(np.mean(bensemble_accuracies) * 100))
print("Desvio padrão: {:.2f}%".format(np.std(bensemble_accuracies) * 100))

#histogram of the accuracy distribution (explicit figure/axes API)
fig, ax = plt.subplots()
ax.hist(bensemble_accuracies, bins=10)
ax.set(xlabel='Acurácia', ylabel='Frequência', title='Distribuição de Acurácia do Ensemble')
plt.show()
Média: 100.00%
Desvio padrão: 0.00%
No description has been provided for this image
In [55]:
#média e desvio padrão
print("Média: {:.2f}%". format(np.mean(bensemble_accuracies)*100))
print("Desvio padrão: {:.2f}%".format(np.std(bensemble_accuracies)*100))

#plota o gráfico
sns.distplot(bensemble_accuracies)
plt.xlabel('Acurácia')
plt.ylabel('Frequência')
plt.yticks([])
plt.title("Distribuição de Acurácia do Ensemble")
plt.show()
Média: 100.00%
Desvio padrão: 0.00%
No description has been provided for this image